; Calculating Bezier objects;
; Implemented in Flat assembler by Maciej Guba,
; http://macgub.co.pl
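; SSE2 and SSE4.1 (dpps) code variants are selected at assembly time through
; the Ext constant; bezier_volume additionally uses haddps (SSE3) unconditionally.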

; I use this equation for Bezier patch:
; x(s,t)=(1-s)^3*(x11*a+x12*b+x13*c+x14*d)+
;        +3*(1-s)^2*s*(x21*a+x22*b+x23*c+x24*d)+
;        +3*(1-s)*s^2*(x31*a+x32*b+x33*c+x34*d)+
;        +s^3*(x41*a+x42*b+x43*c+x44*d)
; y(s,t) and z(s,t) similar...
;  a = (1-t)^3
;  b = 3*(1-t)^2*t
;  c = 3*(1-t)*t^2
;  d = t^3

; for volume:
; x(s,t,r)=(1-s)^3*(x11*a+x12*b+x13*c+x14*d)+
;          +3*(1-s)^2*s*(x21*a+x22*b+x23*c+x24*d)+
;          +3*(1-s)*s^2*(x31*a+x32*b+x33*c+x34*d)+
;          +s^3*(x41*a+x42*b+x43*c+x44*d)
; ......
; x11*a =  (x111*A+x112*B+x113*C+x114*D)*a
; x12*b =  (x121*A+x122*B+x123*C+x124*D)*b
; ......
;  A = (1-r)^3
;  B = 3*(1-r)^2*r
;  C = 3*(1-r)*r^2
;  D = r^3

; a - b - similar
; s, t  - similar
calc_tensor:
; Four 4-term dot products of one factor vector with four consecutive rows.
; in:
;    eax - ptr to 4 factor floats (a,b,c,d); read with movups, so it may be
;          unaligned
;    edx - ptr to 4 rows of 4 floats each (64 bytes); the rows are used as
;          direct memory operands of mulps/dpps, so edx has to be 16-byte
;          aligned
;    ebx - ptr to the 16-byte result; written with movaps, so it has to be
;          16-byte aligned. It may point into the rows: every row is read
;          before the single store at the end.
; out:
;    [ebx+4*i] = a*row_i[0] + b*row_i[1] + c*row_i[2] + d*row_i[3],  i = 0..3
; destroys:
;    xmm0-xmm3, plus xmm5 in the sse2/sse3 variant

  .x11 equ [edx]              ;      row 0
  .x12 equ [edx+16]           ;      row 1
  .x13 equ [edx+32]           ;      row 2
  .x14 equ [edx+48]           ;      row 3
  .f11 equ [ebx]              ;      result, 4 floats

   movups   xmm0,[eax]        ; one copy of the factors per row
   movaps   xmm1,xmm0
   movaps   xmm2,xmm0
   movaps   xmm3,xmm0
if Ext = sse4

   ; dpps imm8: the high nibble selects the lanes that are multiplied and
   ; summed (1111b - all four), the low nibble selects the destination lanes
   ; that receive the sum; the remaining destination lanes are zeroed.
   ; Each row therefore lands in its own lane and orps can merge them.
   dpps     xmm0,.x11,11110001b   ; lane 0 = dot(factors,row 0)
   dpps     xmm1,.x12,11110010b   ; lane 1 = dot(factors,row 1)
   dpps     xmm2,.x13,11110100b   ; lane 2 = dot(factors,row 2)
   dpps     xmm3,.x14,11111000b   ; lane 3 = dot(factors,row 3)
   orps     xmm1,xmm0
   orps     xmm3,xmm2
   orps     xmm3,xmm1
   movaps   .f11,xmm3
else if (Ext = sse2) | (Ext = sse3)
   mulps    xmm0,.x11         ; p0 = products of row 0
   mulps    xmm1,.x12         ; p1
   mulps    xmm2,.x13         ; p2
   mulps    xmm3,.x14         ; p3

   ; rows 0 and 1 -> xmm0 = [s0, s1, .., ..]
   movaps   xmm5,xmm1
   movhlps  xmm5,xmm0         ; xmm5 = [p0.2, p0.3, p1.2, p1.3]
   movlhps  xmm0,xmm1         ; xmm0 = [p0.0, p0.1, p1.0, p1.1]
   addps    xmm0,xmm5         ; pairwise partial sums
   movaps   xmm5,xmm0
   shufps   xmm5,xmm5,10110001b   ; swap neighbours: [e1,e0,e3,e2]
   addps    xmm0,xmm5         ; xmm0 = [s0, s0, s1, s1]
   shufps   xmm0,xmm0,11111000b   ; xmm0 = [s0, s1, s1, s1]

   ; rows 2 and 3 -> xmm2 = [s2, s3, .., ..]
   movaps   xmm5,xmm3
   movhlps  xmm5,xmm2         ; xmm5 = [p2.2, p2.3, p3.2, p3.3]
   movlhps  xmm2,xmm3         ; xmm2 = [p2.0, p2.1, p3.0, p3.1]
   addps    xmm2,xmm5
   movaps   xmm5,xmm2
   shufps   xmm5,xmm5,11111101b   ; [e1,e3,e3,e3]: only lanes 0 and 2 of the
   addps    xmm2,xmm5             ; sum are kept below (s2 and s3)
   shufps   xmm2,xmm2,11111000b   ; xmm2 = [s2, s3, .., ..]

   movlhps  xmm0,xmm2         ; xmm0 = [s0, s1, s2, s3]
   movaps   .f11,xmm0
end if

ret
;===============================================================
bezier_volume:

; Evaluates one point (x,y,z) of a Bezier volume through the tensor product.
; in:
;    xmm0 - s, xmm1 - t, xmm2 - r   (low float of each register, as stored
;           into .s/.t/.r below)
;    esi  - control nodes: 16 rows of 4 x floats, then 16 rows of y, then
;           16 rows of z (768 bytes). The rows are handed to calc_tensor,
;           which uses them as mulps/dpps memory operands, so esi has to be
;           16-byte aligned.
;    edi  - ptr to the result; x, y, z floats are stored at [edi], [edi+4],
;           [edi+8]
; destroys:
;    eax, ebx, edx and the xmm registers used here, in calc_tensor and in
;    calc_bez_factors
; NOTE(review): the final reduction uses haddps, an SSE3 instruction, and is
; assembled for every Ext setting.

       push     ebp
       mov      ebp,esp
       sub      esp,152          ; multiple of 4: keeps esp dword aligned
       and      ebp,-16          ; 16-byte aligned base for the movaps locals

  .x11 equ  [esi]           ;      .x111-x114
  .x12 equ  [esi+16]        ;      .x121-x124
  .x13 equ  [esi+32]        ;      .x131-x134
  .x14 equ  [esi+48]        ;      .x141-x144

  .x21 equ  [esi+16*4]      ;      .x211-x214
  .x22 equ  [esi+16*5]      ;      .x221-x224
  .x23 equ  [esi+16*6]      ;      .x231-x234
  .x24 equ  [esi+16*7]      ;      .x241-x244

  .x31 equ  [esi+16*8]      ;      .x311-x314
  .x32 equ  [esi+16*9]      ;      .x321-x324
  .x33 equ  [esi+16*10]     ;      .x331-x334
  .x34 equ  [esi+16*11]     ;      .x341-x344

  .x41 equ  [esi+16*12]     ;      .x411-x414
  .x42 equ  [esi+16*13]     ;      .x421-x424
  .x43 equ  [esi+16*14]     ;      .x431-x434
  .x44 equ  [esi+16*15]     ;      .x441-x444

  .y11 equ  [esi+16*16]     ;      .y111-y114
  .y12 equ  [esi+16*17]     ;      .y121-y124
  .y13 equ  [esi+16*18]     ;      .y131-y134
  .y14 equ  [esi+16*19]     ;      .y141-y144

  .y21 equ  [esi+16*20]     ;      .y211-y214
  .y22 equ  [esi+16*21]     ;      .y221-y224
  .y23 equ  [esi+16*22]     ;      .y231-y234
  .y24 equ  [esi+16*23]     ;      .y241-y244

  .y31 equ  [esi+16*24]     ;      .y311-y314
  .y32 equ  [esi+16*25]     ;      .y321-y324
  .y33 equ  [esi+16*26]     ;      .y331-y334
  .y34 equ  [esi+16*27]     ;      .y341-y344

  .y41 equ  [esi+16*28]     ;      .y411-y414
  .y42 equ  [esi+16*29]     ;      .y421-y424
  .y43 equ  [esi+16*30]     ;      .y431-y434
  .y44 equ  [esi+16*31]     ;      .y441-y444

  .z11 equ  [esi+16*32]     ;      .z111-z114
  .z12 equ  [esi+16*33]     ;      .z121-z124
  .z13 equ  [esi+16*34]     ;      .z131-z134
  .z14 equ  [esi+16*35]     ;      .z141-z144

  .z21 equ  [esi+16*36]     ;      .z211-z214
  .z22 equ  [esi+16*37]     ;      .z221-z224
  .z23 equ  [esi+16*38]     ;      .z231-z234
  .z24 equ  [esi+16*39]     ;      .z241-z244

  .z31 equ  [esi+16*40]     ;      .z311-z314
  .z32 equ  [esi+16*41]     ;      .z321-z324
  .z33 equ  [esi+16*42]     ;      .z331-z334
  .z34 equ  [esi+16*43]     ;      .z341-z344

  .z41 equ  [esi+16*44]     ;      .z411-z414
  .z42 equ  [esi+16*45]     ;      .z421-z424
  .z43 equ  [esi+16*46]     ;      .z431-z434
  .z44 equ  [esi+16*47]     ;      .z441-z444

  .r   equ  [ebp-4]
  .s   equ  [ebp-8]
  .t   equ  [ebp-12]
  .calc_tensor  equ dword[ebp-16]    ; address of calc_tensor, called indirectly

  ; Bernstein factors of t: .a is the 16-byte aligned start of a,b,c,d
  .d   equ   [ebp-20]
  .c   equ   [ebp-24]
  .b   equ   [ebp-28]
  .a   equ   [ebp-32]

  ; Bernstein factors of s
  .d1  equ   [ebp-36]
  .c1  equ   [ebp-40]
  .b1  equ   [ebp-44]
  .a1  equ   [ebp-48]

  ; Bernstein factors of r
  .d2  equ   [ebp-52]
  .c2  equ   [ebp-56]
  .b2  equ   [ebp-60]
  .a2  equ   [ebp-64]

  ; intermediate sums. They sit at descending addresses, so a calc_tensor
  ; call with edx = .f14 sees the rows f14, f13, f12, f11 (in that order).
  .f11 equ   [ebp-80]
  .f12 equ   [ebp-96]
  .f13 equ   [ebp-112]
  .f14 equ   [ebp-128]

   movss    .r,xmm2
   movss    .t,xmm1
   movss    .s,xmm0
   mov      .calc_tensor,calc_tensor

   ; factor vectors: .a <- t, .a1 <- s, .a2 <- r
   push     edi                   ; edi is the output ptr, keep it
   lea      edi,.a
   movss    xmm0,.t
   call     calc_bez_factors
   lea      edi,.a1
   movss    xmm0,.s
   call     calc_bez_factors
   lea      edi,.a2
   movss    xmm0,.r
   call     calc_bez_factors
   pop      edi

   ; ---- x ----
   ; reduce every group of 4 rows with the r factors
   lea      eax,.a2
   lea      ebx,.f11
   lea      edx,.x11
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f12
   lea      edx,.x21
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f13
   lea      edx,.x31
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f14
   lea      edx,.x41
   call     .calc_tensor
   ; reduce the four intermediate rows with the s factors, result in .f14
   lea      eax,.a1
   lea      ebx,.f14
   lea      edx,.f14
   call     .calc_tensor

   ; reduce with the t factors and store the coordinate
   movaps   xmm3,.f14
   mulps    xmm3,.a
   haddps   xmm3,xmm3
   haddps   xmm3,xmm3
   ; dpps     xmm3,.a,0xff
   movss    [edi],xmm3

   ; ---- y ----
   lea      eax,.a2
   lea      ebx,.f11
   lea      edx,.y11
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f12
   lea      edx,.y21
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f13
   lea      edx,.y31
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f14
   lea      edx,.y41
   call     .calc_tensor
   ; the s reduction has to read the intermediate rows (.f14...), exactly as
   ; for x and z; reading the raw .y41 rows here dropped .f11-.f13
   lea      eax,.a1
   lea      ebx,.f14
   lea      edx,.f14
   call     .calc_tensor

   movaps   xmm3,.f14
   mulps    xmm3,.a
   haddps   xmm3,xmm3
   haddps   xmm3,xmm3
   ; dpps     xmm3,.a,0xff
   movss    [edi+4],xmm3

   ; ---- z ----
   lea      eax,.a2
   lea      ebx,.f11
   lea      edx,.z11
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f12
   lea      edx,.z21
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f13
   lea      edx,.z31
   call     .calc_tensor
   lea      eax,.a2
   lea      ebx,.f14
   lea      edx,.z41
   call     .calc_tensor
   lea      eax,.a1
   lea      ebx,.f14
   lea      edx,.f14
   call     .calc_tensor

   movaps   xmm3,.f14
   mulps    xmm3,.a
   haddps   xmm3,xmm3
   haddps   xmm3,xmm3
   ; dpps     xmm3,.a,0xff
   movss    [edi+8],xmm3

   add     esp,152
   pop     ebp
ret
;==========================================================
calc_bez_factors:
; Cubic Bernstein factors of one parameter value.
; in:
;    xmm0 - parameter 't' in the low float
;    edi  - ptr to 16 bytes that receive a, b, c, d (stored with movups,
;           no alignment needed)
; out:
;    [edi]    a = (1-t)^3
;    [edi+4]  b = 3*(1-t)^2*t
;    [edi+8]  c = 3*(1-t)*t^2
;    [edi+12] d = t^3
; destroys xmm1 - xmm4 and xmm7; xmm0 is left unchanged
; globals: [the_one] and [f3] are read as 8 bytes each (movlps); only the low
;          float of each takes part in the stored result
   .aB  equ [edi]
   .bB  equ [edi+4]
   .cB  equ [edi+8]
   .dB  equ [edi+12]

   ; u = 1-t
   movlps  xmm1,[the_one]
   subps   xmm1,xmm0
   movlps  xmm7,[f3]

   ; t^2
   movaps  xmm4,xmm0
   mulps   xmm4,xmm4

   ; u^2 and a = u*u^2
   movaps  xmm2,xmm1
   mulps   xmm2,xmm1
   movaps  xmm3,xmm1
   mulps   xmm3,xmm2

   ; b = (u^2*3)*t, then xmm2 = [b, ., a, .]
   mulps   xmm2,xmm7
   mulps   xmm2,xmm0
   movlhps xmm2,xmm3

   ; c = (u*3)*t^2 and d = t^2*t, then xmm4 = [d, ., c, .]
   mulps   xmm1,xmm7
   mulps   xmm1,xmm4
   mulps   xmm4,xmm0
   movlhps xmm4,xmm1

   ; pick lanes 2,0 of xmm2 and lanes 2,0 of xmm4 -> [a, b, c, d]
   shufps  xmm2,xmm4,00100010b
   movups  .aB,xmm2
ret
;============================================================================
bezier_surface:
 ; Makes a Bezier surface through the tensor product.
 ; in:
 ;   eax  - 'offd' (Only Free Form Deformation): calc only one vertex
 ;          value x,y,z; any other value: calc the whole patch
 ;   xmm0 - s, xmm1 - t (as stored into .sB/.tB below; used in 'offd' mode
 ;          only, the patch mode restarts both from 0)
 ;   esi  - 16 nodes as three blocks of 4 rows x 4 floats: x rows at esi,
 ;          y rows at esi+64, z rows at esi+128. The rows become mulps/dpps
 ;          memory operands inside calc_tensor, so esi has to be 16-byte
 ;          aligned.
 ;   edi  - vertices (output, 12 bytes per vertex: x,y,z floats)
 ;   ebx  - triangles (output, 3 dword indices per triangle)
 ;   if eax <> 'offd'
 ;     edx - patch quality dimensions: low word = vertex count along s,
 ;           high word = step count along t
 ; out ('offd'):
 ;   one vertex stored at the incoming edi, edi advanced by 12
 ; out (patch):
 ;   ecx - number of vertices, edx - number of triangles
 ;   esi - end of computed triangles list ptr
 ;   edi - end of calculated vertices ptr
 ; destroys: eax, ebx and the xmm registers used here, in calc_tensor and in
 ;   calc_bez_factors; the direction flag is cleared
 ; NOTE(review): a patch with fewer than 2 vertices along s or with 0 steps
 ;   along t is not guarded - the triangle loop counters below would wrap.

        push     ebp
        mov      ebp,esp
        sub      esp,172         ; multiple of 4: keeps esp dword aligned
        and      ebp,-16         ; 16-byte aligned base for the movaps locals
        cld                      ; the movsd copies below must run upwards

   .tB            equ [ebp-4]
   .sB            equ [ebp-8]
   .ctB           equ dword [ebp-12] ; integer counter t
   .csB           equ dword [ebp-16] ; int counter s
   .vert_countBB  equ dword [ebp-20]
   .tri_countBB   equ dword [ebp-24]
   .x1 equ [esi]         ;      .x11-x14, rows x2..x4 follow at +16,+32,+48
   .y1 equ [esi+64]      ;      .y11-y14, rows y2..y4 follow
   .z1 equ [esi+128]     ;      .z11-z14, rows z2..z4 follow

   .dsB         equ      [ebp-28]     ; delta s
   .dtB         equ      [ebp-32]     ; delta t
   .d           equ      [ebp-36]
   .c           equ      [ebp-40]
   .b           equ      [ebp-44]
   .a           equ      [ebp-48]
   .abcd        equ      [ebp-48]     ; factors of t
   .d1          equ      [ebp-52]
   .c1          equ      [ebp-56]
   .b1          equ      [ebp-60]
   .a1          equ      [ebp-64]
   .abcd1       equ      [ebp-64]     ; factors of s
   .one_v       equ dword[ebp-68]     ; copy of eax ('offd' or not)
   .tris_ptr    equ dword[ebp-72]
   .calc_tensor equ dword[ebp-76]     ; address of calc_tensor, called indirectly
   .qfac        equ dword[ebp-80]
   ; .f14, .f13, .f12, .f11 lie at ascending addresses, so calc_tensor with
   ; edx = .f14 reads them as its rows 0..3
   .f11         equ      [ebp-(80+16)]
   .f12         equ      [ebp-(96+16)]
   .f13         equ      [ebp-(112+16)]
   .f14         equ      [ebp-(128+16)]
   .vec_start   equ      [ebp-(128+20)]
   .dest        equ      [ebp-(128+20)]
   mov      .vec_start,edi
   mov      .qfac,edx             ; quality factor
   mov      .one_v,eax
   mov      .tris_ptr,ebx
   mov      .calc_tensor,calc_tensor
   movss    .tB,xmm1
   movss    .sB,xmm0
   xchg     ecx,eax
   cmp      ecx,'offd'
   je       .ffd1

   ; patch mode: unpack the dimensions and derive the parameter steps
   movzx    ecx,dx
   mov      .csB,ecx
   cvtsi2ss xmm1,ecx
   rcpss    xmm1,xmm1             ; approximate 1/cs
   movss    .dsB,xmm1
   shr      edx,16
   mov      .ctB,edx
   cvtsi2ss xmm0,edx
   rcpss    xmm0,xmm0             ; approximate 1/ct
   movss    .dtB,xmm0
   xor      ecx,ecx
   mov      .sB,ecx               ; s = 0.0
   mov      .tB,ecx               ; t = 0.0

   mov      .vert_countBB,ecx
   mov      .tri_countBB,ecx
 .ffd1:
 .looptt:
   push     ecx                   ; outer (t) counter, popped after .loopss
   push     edi
   lea      edi,.abcd
   movss    xmm0,.tB
   call     calc_bez_factors
   pop      edi
   xor      ecx,ecx
   cmp      .one_v,'offd'
   je       .loopss
   mov      dword .sB,ecx         ; patch mode: every row restarts at s = 0.0
 .loopss:
   push     ecx
   movss    xmm0,.sB
   push     edi
   lea      edi,.abcd1
   call     calc_bez_factors
   pop      edi
   ; reduce the x, y and z rows with the s factors into .f14, .f13, .f12
   lea      eax,.a1
   lea      ebx,.f14
   lea      edx,.x1
   call     .calc_tensor
   lea      eax,.a1
   lea      ebx,.f13
   lea      edx,.y1
   call     .calc_tensor
   lea      eax,.a1
   lea      ebx,.f12
   lea      edx,.z1
   call     .calc_tensor
   ; reduce .f14, .f13, .f12 (and the unused .f11) with the t factors:
   ; .f11 = x, y, z, <unused>
   lea      eax,.a
   lea      edx,.f14
   lea      ebx,.f11
   call     .calc_tensor
   ; append x,y,z to the vertex list, edi advances by 12
   push     esi
   lea      esi,.f11
   movsd
   movsd
   movsd
   pop      esi
   movlps   xmm0,.sB
   addss    xmm0,.dsB
   movss    .sB,xmm0
   inc      .vert_countBB
   pop      ecx
   inc      ecx
   cmp      .one_v,'offd'
   je       .break                ; single vertex requested
   cmp      ecx,.csB
   jb       .loopss
   movlps   xmm1,.tB
   addss    xmm1,.dtB
   movss    .tB,xmm1
   pop      ecx
   inc      ecx
   mov      eax,.ctB
   inc      eax                   ; ct+1 rows of vertices
   cmp      ecx,eax ;.ctB
   jb      .looptt
   ; now vertices described one Bezier surface
   ; are just calculated

   ; calc triangles list: ct rows of cs-1 quads, two triangles per quad
   mov    edx,.ctB
   mov    esi,.tris_ptr  ;ebx
   xor    ebx,ebx                 ; ebx = index v of the quad's first vertex
 .oop1:
   mov    ecx,.csB
   dec    ecx
  @@:
   push   ebx
   mov    [esi],ebx               ; triangle 1: v, v+1, v+cs
   mov    eax,ebx
   inc    eax
   mov    [esi+4],eax
   add    ebx,.csB
   mov    [esi+8],ebx
   mov    [esi+12],ebx            ; triangle 2: v+cs, v+1, v+cs+1
   mov    [esi+16],eax
   inc    ebx
   mov    [esi+20],ebx
   pop    ebx
   inc    ebx
   add    esi,24
   add    .tri_countBB,2
   loop   @b

   dec    dword[esi-8]            ; index at offset 16 of the row's last quad
   inc    ebx                     ; skip the last vertex of the row
   dec    edx
   jnz    .oop1

   ; 8 extra triangles (8k+8, 8k+9, 8k), k = 0..7
   ; NOTE(review): the constants are hard-coded; presumably they assume
   ; 8 vertices per row - confirm against the callers.
   mov    eax,8
   xor    ebx,ebx
   mov    ecx,8
 @@:
   mov    [esi],eax
   inc    eax
   mov    [esi+4],eax ;ebx
   mov    [esi+8],ebx
   add    esi,12
   add    eax,7
   add    ebx,8
   inc    .tri_countBB
   loop   @b

   ; esi - end of computed triangles list ptr
   ; edi - end of calculated vertices ptr
   mov      ecx,.vert_countBB
   mov      edx,.tri_countBB

   add      esp,172
   pop      ebp
ret
  .break:
    add     esp,176               ; locals + the t counter pushed at .looptt
    pop     ebp
ret
;==========================================================
draw_bezier_derives:
  ; Draws the segments between indexed nodes through line_grd_tex:
  ; first 4 groups of 3 segments joining consecutive indices, then
  ; 12 segments joining every index with the one 4 words further on.
  ; in:
  ;   esi - source derives nodes, 12 bytes per node; the first two floats
  ;         of a node are converted to integers and used as x,y
  ;   ebx - source derv indexes (rectangles): word indices, 16 are read
  ;   edi - screen buffer
  ; uses some globals: xres_var (x resolution word followed by the y
  ;   resolution word), plain_horizontal, line_grd_tex
  ; NOTE(review): esi and ebx are restored after the first pass only;
  ;   relies on line_grd_tex preserving ebp - confirm.
     push     ebp
     mov      ebp,esp
     sub      esp,32

     .yres     equ [ebp-4]
     .xres     equ [ebp-8]
     .scr      equ [ebp-12]                    ; screen buffer ptr (edi on entry)
     .zbuff    equ [ebp-16]                    ; never written in this procedure
     .tex_ptr  equ dword[ebp-20]               ; holds the constant 0x0000ff00
     .width    equ [ebp-24]                    ; copy of the x resolution
     mov      eax,xres_var
     movzx    edx,word[eax]                 ; x resolution
     movzx    eax,word[eax+2]               ; y resolution
     mov      .xres,edx
     mov      .yres,eax
     mov      .width,edx
     mov      .tex_ptr,0x0000ff00           ; col (presumably the line colour)
     mov      .scr,edi
     push     esi
     push     ebx
     ; pass 1: 4 groups of 4 indices, 3 segments inside every group
     mov      ecx,4
  .again1:
     push     ecx
     mov      ecx,3
   @@:
     push     ecx ebx esi
     movzx    eax,word[ebx]   ; index of the first node
     movzx    edx,word[ebx+2] ; index of the next node
     imul     edx,12          ; node size is 12 bytes
     imul     eax,12
     movlps   xmm2,[eax+esi]  ; x1,y1
     movhps   xmm2,[edx+esi]  ; x2,y2
     cvtps2dq xmm2,xmm2       ; floats -> integers
     ; eax=x1 ebx=y1 ecx=x2 edx=y2 (moved to the registers through the stack)
     sub      esp,16
     movups   [esp],xmm2
     pop      eax ebx ecx edx
     movups   xmm5,.width     ; 16 bytes: .width, .tex_ptr, .zbuff, .scr
     pxor     xmm6,xmm6
     movlps   xmm7,.xres      ; xres,yres ...
     movlhps  xmm7,xmm7       ; ... duplicated into the high half
     mov      edi,plain_horizontal
     call     line_grd_tex
     pop      esi ebx ecx
     add      ebx,2           ; next index pair inside the group
     loop     @b
     pop      ecx
     add      ebx,2           ; step over the last index of the group
     loop     .again1
     pop      ebx
     pop      esi
     ; pass 2: join index i with index i+4 (8 bytes further on), i = 0..11
     mov      ecx,12
  .again3:
     push     esi ecx ebx
     movzx    eax,word[ebx]
     movzx    edx,word[ebx+8]
     imul     eax,12
     imul     edx,12
     movlps   xmm2,[esi+eax]  ; x1,y1
     movhps   xmm2,[esi+edx]  ; x2,y2
     cvtps2dq xmm2,xmm2
     ; eax=x1 ebx=y1 ecx=x2 edx=y2
     sub      esp,16
     movups   [esp],xmm2
     pop      eax ebx ecx edx
     movups   xmm5,.width     ; same register set-up as in pass 1
     pxor     xmm6,xmm6
     movlps   xmm7,.xres
     movlhps  xmm7,xmm7
     mov      edi,plain_horizontal
     call     line_grd_tex
     pop      ebx ecx esi
     add      ebx,2
     loop     .again3
  .en:
     mov      esp,ebp         ; drop the locals
     pop      ebp

ret
;====================================================
draw_bezier_bars:
; Calls bar_edge once for every node of the list.
; in:
;  esi - ptr to derives vertices list, 12 bytes per vertex; the first
;        8 bytes of a vertex are loaded into the low half of xmm0
;  ecx - number of nodes; 0 returns at once (the counting loop below tests
;        only after the first pass, so it would otherwise wrap around)
;  globals - xres_var, yres_var; bar_edge presumably uses screen_ptr
; per bar_edge call: xmm0 = vertex, edx = xres-4, esi = yres-4,
;        eax = 0x00ffff00
; destroys: eax, ecx, edx, esi, edi, xmm0 and whatever bar_edge destroys
;        beyond the registers saved around the call
   push     ebp
   mov      ebp,esp
   .ym4     equ dword[ebp-4]          ; yres-4, created by 'push edi' below
   .cnt     equ dword[ebp-8]          ; node count, created by 'push ecx' below
   test     ecx,ecx
   jz       .no_nodes
   movzx    edx,[xres_var]
   movzx    edi,[yres_var]
   sub      edx,4
   sub      edi,4
   push     edi                       ; -> .ym4
   push     ecx                       ; -> .cnt
   xor      ecx,ecx                   ; ecx = index of the current node
 @@:
   push     esi edx ebx ecx           ; keep them across bar_edge
   ; esi = ptr to verts
   movlps   xmm0,[esi]
   mov      esi,.ym4
   mov      eax,0x00ffff00
   call     bar_edge
   pop      ecx ebx edx esi
   add      esi,12                    ; next vertex
   inc      ecx
   cmp      ecx,.cnt
   jnz      @b

 .no_nodes:
   mov      esp,ebp
   pop      ebp
ret
